# This script integrates the CAPP predictors and the genomic risk scores. 
# This script gives the example code for integrating the CAPP model with the PRS, but the input datasets can be easily adapted for the integration of the CAPP model with the newborn and childhood MRSs (with or without the PRS).
# Newborn MRS - data found in IOWBC_MRS_data.xlsx, sheet: "IOWBC nMRS"
# Childhood MRS - data found in IOWBC_MRS_data.xlsx, sheet: "IOWBC cMRS"
# Integrated datasets for the CAPP+nMRS, CAPP+cMRS and CAPP+PRS+nMRS, and CAPP+PRS+cMRS can be found in IOWBC_CAPP_integrated_data.xlsx 
# - note. in IOWBC_CAPP_integrated_data.xlsx, data for the integrated newborn and childhood MRS with the CAPP model can be found within the same spreadsheet. Therefore, if developing the CAPP+nMRS integrated model, the cMRS will need to be deleted from the dataset, and vice versa.
# The integrated model was developed using the same machine learning algorithm and training dataset characteristics as the best CAPP model 
# - i.e. SVM algorithm (RBF kernel) trained on the complete training dataset, oversampled 300%, undersampled
# To ensure sufficient numbers of individuals for training, a new training-test set split was performed rather than subsetting those with complete data from the initial CAPE training set:
# Integrate data >> standardise >> complete, 300% oversampled & undersampled dataset >> linear svm 

# Imports
import os
import pandas as pd
import numpy as np
from time import time
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve
from collections import Counter
from sklearn.utils import shuffle
from sklearn.metrics import balanced_accuracy_score, average_precision_score, f1_score
from imblearn.over_sampling import ADASYN
from numpy import argmax, arange
import pickle
# Classifiers
from sklearn.svm import SVC

# Set working directory
# NOTE(review): "/../../.." resolves to the filesystem root ("/"); this looks
# like a placeholder and should point at the directory holding the input CSVs.
os.chdir("/../../..")

#### Define function to evaluate performance measures
def performance(y_test, y_pred, y_probs):
	"""Print a panel of classification performance measures.

	Parameters
	----------
	y_test : array-like of true 0/1 labels.
	y_pred : array-like of predicted 0/1 labels.
	y_probs : array-like of predicted probabilities for the positive class
		(used for ROC AUC and precision-recall AUC).

	Returns
	-------
	None. All measures are printed to stdout.
	"""
	# Confusion matrix layout: rows = true class, columns = predicted class.
	cm_test = confusion_matrix(y_test, y_pred)	
	test_report = classification_report(y_test, y_pred)
	accuracy = accuracy_score(y_test, y_pred)
	balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
	sensitivity =  cm_test[1,1]/(cm_test[1,0]+cm_test[1,1])								
	specificity = cm_test[0,0]/(cm_test[0,0]+cm_test[0,1])
	PPV = cm_test[1,1]/(cm_test[1,1]+cm_test[0,1])
	NPV = cm_test[0,0]/(cm_test[0,0]+cm_test[1,0])
	# Positive/negative likelihood ratios; note LRp divides by (1 - specificity),
	# which is 0 when there are no false positives (yields inf/nan, not an error,
	# for numpy floats).
	LRp = sensitivity/(1-specificity)
	LRn = (1-sensitivity)/specificity
	F1 = f1_score(y_test, y_pred)
	ROCAUC = roc_auc_score(y_test, y_probs)
	PR_AUC = average_precision_score(y_test, y_probs)
	print (cm_test)
	print (test_report)
	print('accuracy:=%f' % (accuracy))
	print('balanced_accuracy:=%f' % (balanced_accuracy))
	print('Sensitivity:=%f' % (sensitivity))
	print('Specificity:=%f' % (specificity))
	print('PPV:=%f' % (PPV))
	print('NPV:=%f' % (NPV))
	print('LRp:=%f' % (LRp))
	print('LRn:=%f' % (LRn))
	# Fix: F1 was computed but never reported.
	print('F1:=%f' % (F1))
	print('ROCAUC:=%f' % (ROCAUC))
	print('PR_AUC:=%f' % (PR_AUC))
	return
	
########################################
### PRESCHOOL MODEL - CLINICAL + PRS ###
########################################
# Load cleaned, unstandardised features used in the CAPE model - data found in IOWBC_data.xlsx, sheet: "Early life data"
data = pd.read_csv("Preschool_QC_1368Ids.csv", index_col=False)
# Drop the stale index column written by an earlier to_csv
data = data.drop(columns=['Unnamed: 0'])

# Subset the 12 variables included in the preschool model (plus ID and outcome)
preschool_vars = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4', 'Total.Bf.duration', 'Wheeze_4YR', 'Cough_4YR', 'Noct_Symp_4YR', 'Atopy_4YR', 'Polysensitisation_4YR', 'SES', 'Asthma_10YR']
data1 = data[preschool_vars]

# Add PRS data to the dataset - data found in IOWBC_PRS_data.xlsx, sheet: "IOWBC PRS"
PRS = pd.read_csv("PRS_116snp_Asthma10YR_Adjusted.csv", index_col=False)
PRS = PRS.drop(columns=['In_Regression'])
# 924 IDs 
PRS = PRS.rename(columns={'IID': 'Study_ID'}).dropna()
Counter(PRS.Asthma_10YR)
# 908 samples should have PRS data - 767 controls, 141 cases

# Recode the Asthma_10YR variable from the PRS analysis to: 0=controls (1 in PRS analysis), 1=cases (2 in PRS analysis)
PRS['Asthma_10YR'] = (PRS['Asthma_10YR'] == 2.0).astype(int)

# Outer merge keeps everyone; the overlapping outcome column gets _x/_y suffixes
all_data = data1.merge(PRS, how='outer', on='Study_ID')
all_data.isnull().sum()

# Save integrated dataset - data found in IOWBC_integrated_data.xlsx, sheet: "CAPP+PRS"
all_data.to_csv("CAPP+PRS_model_data_1368ID.csv")

# Keep complete cases only
all_data = all_data.dropna()
# 401 IDs remain

model_columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4', 'PRS', 'Total.Bf.duration', 'Wheeze_4YR', 'Cough_4YR', 'Noct_Symp_4YR', 'Atopy_4YR', 'Polysensitisation_4YR', 'SES', 'Asthma_10YR_x']
all_data = all_data[model_columns]

# Split data into training and test sets
# Features = everything except the outcome column (Asthma_10YR_x from the merge)
complete_subset_features = all_data.drop(columns=['Asthma_10YR_x'])

# Outcome = asthma status at age 10
complete_subset_outcome = all_data['Asthma_10YR_x']

# 66.6% training / 33.3% test, stratified on the outcome so class balance is preserved
X_train, X_test, y_train, y_test = train_test_split(
    complete_subset_features,
    complete_subset_outcome,
    stratify=complete_subset_outcome,
    test_size=0.333,
    shuffle=True,
    random_state=123,
)

# Training set (n=267, asthma=38, no asthma=229)	Test set (n=134, asthma=19, no asthma=115)

# Save the original train/test set IDs as single-column DataFrames
Train_IDs = X_train[['Study_ID']]
Test_IDs = X_test[['Study_ID']]

# Remove the identifier so it is never used as a model feature
X_train = X_train.drop(columns=['Study_ID'])
X_test = X_test.drop(columns=['Study_ID'])

# Standardise training and test sets
# The first six columns are continuous; the remaining columns are categorical
# and are passed through unchanged. (The unused duplicate subset `cont` was removed.)
cont_cols = ['Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'SDS_BMI_4', 'PRS']

# Fit the scaler on the training data only, then apply the same transform to the test data
scaler = StandardScaler()
cont_train = pd.DataFrame(scaler.fit_transform(X_train.iloc[:, 0:6]), columns=cont_cols)
cat_train = X_train.iloc[:, 6:]
SX_train = pd.concat([cont_train, cat_train.reset_index(drop=True)], axis=1)
# Re-attach IDs and outcome to produce the full standardised training table
SXY_train = pd.concat([Train_IDs.reset_index(drop=True), SX_train], axis=1)
SXY_train = pd.concat([SXY_train, y_train.reset_index(drop=True)], axis=1)
#SXY_train.to_csv("CAPP_PRS_standardised_training_dataset_267ID.csv") - data found in IOWBC_CAPP_integrated_data.xlsx, sheet: "CAPP+PRS standardised training"

# Test set: transform only (no refitting), to avoid information leakage
cont_test = pd.DataFrame(scaler.transform(X_test.iloc[:, 0:6]), columns=cont_cols)
cat_test = X_test.iloc[:, 6:]
SX_test = pd.concat([cont_test, cat_test.reset_index(drop=True)], axis=1)
SXY_test = pd.concat([Test_IDs.reset_index(drop=True), SX_test], axis=1)
SXY_test = pd.concat([SXY_test, y_test.reset_index(drop=True)], axis=1)
#SXY_test.to_csv("CAPP_PRS_standardised_test_dataset_134ID.csv") - data found in IOWBC_CAPP_integrated_data.xlsx, sheet: "CAPP+PRS standardised test"

# Over/undersampled training data as required - 300% oversampling, undersampling
Counter(SXY_train.Asthma_10YR_x)
#Counter({0: 229, 1: 38})

# Sampling strategy = no. of cases after oversampling / no. of controls before oversampling
# Oversample cases by 300% using ADASYN synthetic minority sampling
OSX_train, Oy_train = ADASYN(sampling_strategy=(154/229), random_state=123).fit_resample(SX_train, y_train)
# Fix: message previously said "Original dataset shape" but reports the RESAMPLED counts
print('Resampled dataset shape %s' % Counter(Oy_train))
#Resampled dataset shape Counter({0: 229, 1: 153})

# Convert arrays into dataframes
OSX_train_df = pd.DataFrame(data=OSX_train)
Oy_train_df = pd.DataFrame(data=Oy_train)
# Format synthetic data
# Continuous variables rounded to 6dp
OSX_cont = OSX_train_df.iloc[:, 0:6].round(6)

# Categorical variables rounded back to integers (ADASYN interpolates, so
# synthetic rows have fractional values for the binary/ordinal features)
OSX_cat = OSX_train_df.iloc[:, 6:].round()

# Combine into one oversampled, formatted dataset.
# NOTE(review): Train_IDs has only 267 rows while the resampled data has more,
# so synthetic rows get NaN Study_IDs; the ID column is dropped again before
# model fitting, so this only affects the saved/intermediate table.
Oversampled_300_train = pd.concat([OSX_cont, OSX_cat], axis=1)
Oversampled_300_train = pd.concat([Train_IDs.reset_index(drop=True), Oversampled_300_train], axis=1)
Oversampled_300_train.columns = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1','SDS_BMI_4', 'PRS','Total.Bf.duration', 'Wheeze_4YR', 'Cough_4YR', 'Noct_Symp_4YR', 'Atopy_4YR', 'Polysensitisation_4YR','SES']
Oy_train_df.columns =['Asthma_10YR_x']
Oversampled_300 = pd.concat([Oversampled_300_train, Oy_train_df], axis=1)

# Undersample the controls to balance the classes (152 cases vs 152 controls)
s1 = Oversampled_300.loc[Oversampled_300['Asthma_10YR_x'] == 1]
s0 = Oversampled_300.loc[Oversampled_300['Asthma_10YR_x'] == 0]
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:152,]
s1 = s1.iloc[:152,]
# Fix: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat(..., ignore_index=True) is the equivalent replacement.
SXY_train_OU = pd.concat([s1, s0], ignore_index=True)
SXY_train_OU = shuffle(SXY_train_OU, random_state=123)
# Fix: message previously said "Original dataset shape" for the balanced set
print('Balanced training set shape %s' % Counter(SXY_train_OU.Asthma_10YR_x))
#Balanced training set shape Counter({1: 152, 0: 152})

# Drop Study_ID (first column) and split features (middle) from outcome (last column)
X_train = SXY_train_OU.iloc[:,1:-1]
y_train = SXY_train_OU.iloc[:,-1]

### Model development ###
# Define a linear svm classifier.
# Stage 1: randomised search over a log-spaced C range to locate the promising region.
clf = SVC(kernel='linear', probability=True, random_state=123)
param_grid = {'C': np.logspace(-3, 2, 100)}

random_search = RandomizedSearchCV(
    clf,
    scoring='balanced_accuracy',
    param_distributions=param_grid,
    n_iter=100,
    n_jobs=-1,
    cv=StratifiedKFold(5),
    random_state=123,
)
start = time()
random_search.fit(X_train, y_train)
RStime = time() - start
best_parameters = random_search.best_params_
print(best_parameters)
#{'C': 0.02310129700083159
best_score = random_search.best_score_
print(best_score)
#0.7763157894736842

# Stage 2: exhaustive grid search on a fine C grid.
clf = SVC(kernel='linear', probability=True, random_state=123)
param_grid = {'C': np.arange(0.01, 5.01, 0.01)}
grid_search = GridSearchCV(
    clf,
    scoring='balanced_accuracy',
    param_grid=param_grid,
    cv=StratifiedKFold(5),
    n_jobs=16,
)
start = time()
grid_search.fit(X_train, y_train)
GStime = time() - start
# How many candidate parameter settings were evaluated
Candidates = len(grid_search.cv_results_['params'])
print(Candidates)
# Best C and its cross-validated balanced accuracy
best_parameters = grid_search.best_params_
print(best_parameters)
#{'C': 0.85}
best_score = grid_search.best_score_
print(best_score)
#0.7796052631578947

# Build the final model with the grid-search optimum (C=0.85)
best_clf = SVC(kernel='linear', C=0.85, probability=True, random_state=123)

# Fit on the oversampled/undersampled, standardised training data
best_clf.fit(X_train,y_train)

### Training set Performance
# Positive-class probabilities for the AUC metrics, hard labels for the rest
probs = best_clf.predict_proba(X_train)
preds = probs[:,1]
y_train_pred = best_clf.predict(X_train)
performance(y_train, y_train_pred, preds)
ROCAUC_train = roc_auc_score(y_train, preds)
print(ROCAUC_train)
#0.9016187673130194

# Evaluate model in the held-out (standardised) test set
probs = best_clf.predict_proba(SX_test)
preds = probs[:,1]
ROCAUC_test = roc_auc_score(y_test, preds)
print(ROCAUC_test)
#0.7899313501144164

y_pred = best_clf.predict(SX_test)
cm_test = confusion_matrix(y_test, y_pred)
print(cm_test)
performance(y_test, y_pred, preds)


#### Identify optimal threshold based on Youden's index ####
test_probs = best_clf.predict_proba(SX_test)
# keep probabilities for the positive outcome only
test_preds = test_probs[:,1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(y_test, test_preds)
# get the best threshold
J = tpr - fpr
ix = argmax(J)
best_thresh = thresholds[ix]
print('Best Threshold=%f' % (best_thresh))
# 0.37586164641108366

# Obtain classifications based on optimal threshold cutoff
probs_opt = best_clf.predict_proba(SX_test)
SX_test['preds'] = probs_opt[:,1]
pred_opt = SX_test['preds'].map(lambda x: 1 if x >= best_thresh else 0)

# Check performance in test set
performance(y_test, pred_opt,SX_test['preds'])		

# Save the fitted model to disk for later reuse.
# Fix: removed the redundant `import pickle` (already imported at the top of the
# file) and wrapped the file in a context manager so the handle is closed
# deterministically instead of being left to the garbage collector.
filename = 'CAPP_PRS_linearSVM_COU300.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(best_clf, model_file)